{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from simpletransformers.classification import ClassificationModel, ClassificationArgs\n",
    "import pandas as pd\n",
    "from tqdm.notebook import tqdm\n",
    "import logging\n",
    "import editdistance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "logging.basicConfig(level=logging.INFO)\n",
    "transformers_logger = logging.getLogger(\"transformers\")\n",
    "transformers_logger.setLevel(logging.WARNING)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "04dbd004e1ce455086306281d46e7f1d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/18800 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4468f71b0ac24fd6bed01caf209d5f79",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2000 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# prepare data\n",
    "train_file = \"data/fake/train.csv\"\n",
    "train_data = []\n",
    "with open(train_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in tqdm(f.readlines()):\n",
    "        line = line.strip().split(\"\\t\")\n",
    "        line[1] = int(line[1]) - 1\n",
    "        train_data.append(line)\n",
    "\n",
    "test_file = \"data/fake/test.csv\"\n",
    "test_data = []\n",
    "with open(test_file, \"r\", encoding=\"utf-8\") as f:\n",
    "    for line in tqdm(f.readlines()):\n",
    "        line = line.split(\"\\t\")\n",
    "        line[1] = int(line[1]) - 1\n",
    "        test_data.append(line)\n",
    "\n",
    "train_df = pd.DataFrame(train_data)\n",
    "eval_df = pd.DataFrame(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "# setup model\n",
    "\n",
    "model_args = {\n",
    "    \"num_train_epochs\":3,\n",
    "    \"max_seq_length\":256,\n",
    "    \"train_batch_size\":16,\n",
    "    \"output_dir\":\"\"\n",
    "}\n",
    "\n",
    "model = ClassificationModel(\"bert\", \"bert-base-uncased\", args=model_args, use_cuda=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/LAB/chenty/miniconda3/envs/benchmark/lib/python3.8/site-packages/simpletransformers/classification/classification_model.py:445: UserWarning: Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.\n",
      "  warnings.warn(\n",
      "INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9ca83a951aa14c5abbe5132d3f64b6ee",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/38 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_bert_256_2_18800\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e6125406664849eb8358313ff76bac90",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Epoch:   0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aa8ac4ecc90f41959c00dff99fff9913",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Running Epoch 0 of 3:   0%|          | 0/1175 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/LAB/chenty/miniconda3/envs/benchmark/lib/python3.8/site-packages/torch/optim/lr_scheduler.py:131: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`.  Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n",
      "  warnings.warn(\"Detected call of `lr_scheduler.step()` before `optimizer.step()`. \"\n",
      "/home/LAB/chenty/miniconda3/envs/benchmark/lib/python3.8/site-packages/torch/optim/lr_scheduler.py:216: UserWarning: Please also save or load the state of the optimizer when saving or loading the scheduler.\n",
      "  warnings.warn(SAVE_STATE_WARNING, UserWarning)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b754823b793445feb06191f492b7f7d2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Running Epoch 1 of 3:   0%|          | 0/1175 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "715d7eeb88364934b187c9f0063da474",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Running Epoch 2 of 3:   0%|          | 0/1175 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to outputs/.\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(3525, 0.02760333667425765)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.train_model(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/LAB/chenty/miniconda3/envs/benchmark/lib/python3.8/site-packages/simpletransformers/classification/classification_model.py:1025: UserWarning: Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels.\n",
      "  warnings.warn(\n",
      "INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.\n",
      "INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_dev_bert_256_2_2000\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e06cb510b57c430eb6e92449ce92e75c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Running Evaluation:   0%|          | 0/250 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:simpletransformers.classification.classification_model:{'mcc': 0.9970004925023529, 'tp': 997, 'tn': 1000, 'fp': 2, 'fn': 1, 'auroc': 0.9999889999559999, 'auprc': 0.9999889244689254, 'eval_loss': 0.007021561433852184}\n"
     ]
    }
   ],
   "source": [
    "result, model_outputs, wrong_predictions = model.eval_model(eval_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "acc = (len(test_data) -len(wrong_predictions))  / len(test_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9985"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "acc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'ClassificationModel' object has no attribute 'save_pretrained'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-16-c5a9993313d1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"models/fake\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/miniconda3/envs/benchmark/lib/python3.8/site-packages/simpletransformers/classification/classification_model.py\u001b[0m in \u001b[0;36msave_model\u001b[0;34m(self, output_dir, optimizer, scheduler, model, results)\u001b[0m\n\u001b[1;32m   1711\u001b[0m             \u001b[0;31m# Take care of distributed/parallel training\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1712\u001b[0m             \u001b[0mmodel_to_save\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodule\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"module\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1713\u001b[0;31m             \u001b[0mmodel_to_save\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1714\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1715\u001b[0m             \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"training_args.bin\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: 'ClassificationModel' object has no attribute 'save_pretrained'"
     ]
    }
   ],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
